Download course materials from bit.ly/Leeds_2019-02-28.
install.packages(tidyverse,ordinal,lme4)
Dataset
sleep:
head(sleep)
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
What does this do?
sleep[1,]
## extra group ID
## 1 0.7 1 1
How does this differ from [1,]?
sleep[2,] # so that means this is row 2
## extra group ID
## 2 -1.6 1 2
How does this differ from [3,]?
sleep[,3] # what do you think this is?
## [1] 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
## Levels: 1 2 3 4 5 6 7 8 9 10
…which makes it dataset[row,column]
You can also navigate with column names:
sleep$ID
## [1] 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
## Levels: 1 2 3 4 5 6 7 8 9 10
How would you view the column extra?
sleep$extra
## [1] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0 1.9 0.8 1.1 0.1
## [15] -0.1 4.4 5.5 1.6 4.6 3.4
Use str() to get a summary of the structure of the dataset
str(sleep)
## 'data.frame': 20 obs. of 3 variables:
## $ extra: num 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2 ...
## $ group: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ID : Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
What are all the unique values in ID?
unique(sleep$extra)
## [1] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0 1.9 1.1 0.1 4.4
## [15] 5.5 1.6 4.6
What’s the value in the first row, third column?
sleep[1,3]
## [1] 1
## Levels: 1 2 3 4 5 6 7 8 9 10
What’s the first element in the column ID?
sleep[1,]$ID
## [1] 1
## Levels: 1 2 3 4 5 6 7 8 9 10
sleep$ID[1]
## [1] 1
## Levels: 1 2 3 4 5 6 7 8 9 10
You can also view the dataset as a spreadsheet (although it can’t be altered).
View(sleep)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
A tibble is different from a table.
as.tibble(sleep)
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
## # A tibble: 20 x 3
## extra group ID
## <dbl> <fct> <fct>
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
## 7 3.7 1 7
## 8 0.8 1 8
## 9 0 1 9
## 10 2 1 10
## 11 1.9 2 1
## 12 0.8 2 2
## 13 1.1 2 3
## 14 0.1 2 4
## 15 -0.1 2 5
## 16 4.4 2 6
## 17 5.5 2 7
## 18 1.6 2 8
## 19 4.6 2 9
## 20 3.4 2 10
Pipes are like toy funnels
%>%
sleep %>%
head()
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
# or
sleep %>%
head
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
How many attestations of each type of group?
sleep %>%
count(group)
## # A tibble: 2 x 2
## group n
## <fct> <int>
## 1 1 10
## 2 2 10
How can you make a new column? Duplicate group into group2:
sleep %>%
mutate(group2 = group)
## extra group ID group2
## 1 0.7 1 1 1
## 2 -1.6 1 2 1
## 3 -0.2 1 3 1
## 4 -1.2 1 4 1
## 5 -0.1 1 5 1
## 6 3.4 1 6 1
## 7 3.7 1 7 1
## 8 0.8 1 8 1
## 9 0.0 1 9 1
## 10 2.0 1 10 1
## 11 1.9 2 1 2
## 12 0.8 2 2 2
## 13 1.1 2 3 2
## 14 0.1 2 4 2
## 15 -0.1 2 5 2
## 16 4.4 2 6 2
## 17 5.5 2 7 2
## 18 1.6 2 8 2
## 19 4.6 2 9 2
## 20 3.4 2 10 2
Let’s rename the levels in group to be “one” and “two”:
sleep %>%
mutate(group2 = case_when(group==1 ~ "one",
TRUE ~ "two"))
## extra group ID group2
## 1 0.7 1 1 one
## 2 -1.6 1 2 one
## 3 -0.2 1 3 one
## 4 -1.2 1 4 one
## 5 -0.1 1 5 one
## 6 3.4 1 6 one
## 7 3.7 1 7 one
## 8 0.8 1 8 one
## 9 0.0 1 9 one
## 10 2.0 1 10 one
## 11 1.9 2 1 two
## 12 0.8 2 2 two
## 13 1.1 2 3 two
## 14 0.1 2 4 two
## 15 -0.1 2 5 two
## 16 4.4 2 6 two
## 17 5.5 2 7 two
## 18 1.6 2 8 two
## 19 4.6 2 9 two
## 20 3.4 2 10 two
What’s wrong with this one?
sleep %>%
mutate(group2 = case_when(group==1 ~ as.factor("one"),
group==2 ~ as.factor("two")))
## Warning in `[<-.factor`(`*tmp*`, i, value = structure(1L, .Label = "two",
## class = "factor")): invalid factor level, NA generated
## extra group ID group2
## 1 0.7 1 1 one
## 2 -1.6 1 2 one
## 3 -0.2 1 3 one
## 4 -1.2 1 4 one
## 5 -0.1 1 5 one
## 6 3.4 1 6 one
## 7 3.7 1 7 one
## 8 0.8 1 8 one
## 9 0.0 1 9 one
## 10 2.0 1 10 one
## 11 1.9 2 1 <NA>
## 12 0.8 2 2 <NA>
## 13 1.1 2 3 <NA>
## 14 0.1 2 4 <NA>
## 15 -0.1 2 5 <NA>
## 16 4.4 2 6 <NA>
## 17 5.5 2 7 <NA>
## 18 1.6 2 8 <NA>
## 19 4.6 2 9 <NA>
## 20 3.4 2 10 <NA>
sleep %>%
mutate(group2 = case_when(group==1 ~"one",
group==2 ~"two")) %>%
mutate(group2 = as.factor(group2))
## extra group ID group2
## 1 0.7 1 1 one
## 2 -1.6 1 2 one
## 3 -0.2 1 3 one
## 4 -1.2 1 4 one
## 5 -0.1 1 5 one
## 6 3.4 1 6 one
## 7 3.7 1 7 one
## 8 0.8 1 8 one
## 9 0.0 1 9 one
## 10 2.0 1 10 one
## 11 1.9 2 1 two
## 12 0.8 2 2 two
## 13 1.1 2 3 two
## 14 0.1 2 4 two
## 15 -0.1 2 5 two
## 16 4.4 2 6 two
## 17 5.5 2 7 two
## 18 1.6 2 8 two
## 19 4.6 2 9 two
## 20 3.4 2 10 two
Now, let’s recreate the count function with group_by and summarise:
sleep %>%
group_by(group) %>%
summarise(n = n())
## # A tibble: 2 x 2
## group n
## <fct> <int>
## 1 1 10
## 2 2 10
We cal use group_by and summarise to do a lot more than just count:
# mean value of `extra` by `group2`
sleep %>%
mutate(group2 = case_when(group==1 ~"one",
group==2 ~"two")) %>%
mutate(group2 = as.factor(group2)) %>%
group_by(group) %>%
summarise(effectMean = mean(extra))
## # A tibble: 2 x 2
## group effectMean
## <fct> <dbl>
## 1 1 0.75
## 2 2 2.33
Dataset
quakes:
What does quakes look like?
quakes %>%
str
## 'data.frame': 1000 obs. of 5 variables:
## $ lat : num -20.4 -20.6 -26 -18 -20.4 ...
## $ long : num 182 181 184 182 182 ...
## $ depth : int 562 650 42 626 649 195 82 194 211 622 ...
## $ mag : num 4.8 4.2 5.4 4.1 4 4 4.8 4.4 4.7 4.3 ...
## $ stations: int 41 15 43 19 11 12 43 15 35 19 ...
# View(quakes)
How many observations are there per “level” of magnitude?
quakes %>%
group_by(mag) %>%
summarise(numberOfObvs = n())
## # A tibble: 22 x 2
## mag numberOfObvs
## <dbl> <int>
## 1 4 46
## 2 4.1 55
## 3 4.2 90
## 4 4.3 85
## 5 4.4 101
## 6 4.5 107
## 7 4.6 101
## 8 4.7 98
## 9 4.8 65
## 10 4.9 54
## # … with 12 more rows
Let’s create a table of the means, standard deviations, and standard errors for both stations reporting and depths:
quakes %>%
group_by(mag) %>%
summarise(n = n(),
stationMean = mean(stations),
stationSD = sd(stations),
stationSE = sd(stations)/sqrt(n),
depthMean = mean(depth),
depthSD = sd(depth),
depthSE = sd(depth)/sqrt(n))
## # A tibble: 22 x 8
## mag n stationMean stationSD stationSE depthMean depthSD depthSE
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 46 14.9 4.25 0.626 410. 170. 25.0
## 2 4.1 55 15.7 4.39 0.591 412. 201. 27.1
## 3 4.2 90 18.4 5.54 0.584 390. 186. 19.7
## 4 4.3 85 19.3 5.94 0.644 358. 204. 22.1
## 5 4.4 101 22.3 8.06 0.802 307. 205. 20.4
## 6 4.5 107 24.4 8.31 0.803 334. 214. 20.7
## 7 4.6 101 27.3 9.67 0.962 331. 224. 22.3
## 8 4.7 98 31.2 9.11 0.921 238. 204. 20.6
## 9 4.8 65 36.8 11.2 1.38 229. 199. 24.7
## 10 4.9 54 42.9 13.3 1.80 248. 228. 31.0
## # … with 12 more rows
This dataset is wide. Let’s make it long using gather (compare to spread):
quakes %>%
group_by(mag) %>%
summarise(n = n(),
stationMean = mean(stations),
stationSD = sd(stations),
stationSE = sd(stations)/sqrt(n),
depthMean = mean(depth),
depthSD = sd(depth),
depthSE = sd(depth)/sqrt(n)) %>%
gather(measurement, values, 3:8) #-> quakesLong
## # A tibble: 132 x 4
## mag n measurement values
## <dbl> <int> <chr> <dbl>
## 1 4 46 stationMean 14.9
## 2 4.1 55 stationMean 15.7
## 3 4.2 90 stationMean 18.4
## 4 4.3 85 stationMean 19.3
## 5 4.4 101 stationMean 22.3
## 6 4.5 107 stationMean 24.4
## 7 4.6 101 stationMean 27.3
## 8 4.7 98 stationMean 31.2
## 9 4.8 65 stationMean 36.8
## 10 4.9 54 stationMean 42.9
## # … with 122 more rows
# https://bookdown.org/ndphillips/YaRrr/pirateplot.html
library(tidyverse)
# install.packages("ggbeeswarm")
library(ggbeeswarm)
Dataset
iris:
The function ggplot layers different geometries and aesthetics to build up a plot:
iris %>%
mutate(Sepal.Width = Sepal.Width+rnorm(length(Sepal.Width),0,.1))%>%
ggplot(aes(x=Species,y=Sepal.Width,fill=Species)) +
geom_violin(lty=0,alpha=.5)+
geom_boxplot(alpha=0.5,lwd=.5) +
geom_quasirandom(dodge.width=1, alpha=.5)
In what ways might we change this plot?
iris %>%
mutate(
Sepal.Width = Sepal.Width +
rnorm(
length(Sepal.Width) , 0 , .1
)
) %>%
ggplot(aes(x=Species,y=Sepal.Width)) +
geom_boxplot() +
geom_quasirandom(aes(pch=Species), alpha=.5)
#ggplot(aes(x=Species,y=Sepal.Width,fill=Species)) +
#geom_violin(lty=0,alpha=.5)+
#geom_boxplot(alpha=0.5,lwd=.5) +
#geom_quasirandom(dodge.width=1, alpha=.5)
Going back to quakes, let’s pipe our table into a ggplot (and fill in missing values):
quakes %>%
group_by(mag) %>%
summarise(n=n(),
sta=mean(stations),
staSD=sd(stations),
staSE=staSD/sqrt(n),
dep=mean(depth),
depSD=sd(depth),
depSE=depSD/sqrt(n)) %>%
ggplot(aes(x=mag)) +
geom_point(aes(y=sta),colour="red")+
geom_path(aes(y=sta),colour="red")+
geom_ribbon(aes(ymin=sta-staSD,ymax=sta+staSD),fill="red",alpha=.2) +
geom_ribbon(aes(ymin=sta-staSE,ymax=sta+staSE),fill="red",alpha=.5) +
geom_point(aes(y=dep),colour="blue")+
geom_path(aes(y=dep),colour="blue")+
geom_ribbon(aes(ymin=dep-depSD,ymax=dep+depSD),fill="blue",alpha=.2) +
geom_ribbon(aes(ymin=dep-depSE,ymax=dep+depSE),fill="blue",alpha=.5) +
theme_bw() + ylab("depth OR number of stations reporting")
Inheritance:
ggplot(data=quakes, aes(x=long)) +
geom_point(aes(y=lat,colour=stations)) #+
#geom_path()
See Simulated Data notebook
data <- read_csv("../data/simulated-data.csv")
## Parsed with column specification:
## cols(
## subj = col_double(),
## age = col_double(),
## item = col_double(),
## freq = col_character(),
## gram = col_character(),
## rating = col_double(),
## accuracy = col_double(),
## region = col_double(),
## word = col_character(),
## rt = col_double()
## )
Plot boxplots and violin plots for the ratings. Subset by participant.
data %>%
filter(region == 1) %>%
ggplot(aes(x=gram, y=rating, fill=freq)) +
geom_boxplot(position = position_dodge(.9)) +
#geom_violin(alpha=.5) +
facet_wrap(~subj)
Group by region, word, frequency, and grammaticality. Summarise mean and standard error.
data %>%
group_by(region, word, freq, gram) %>%
summarise(meanRT = mean(rt),
seRT = sd(rt)/sqrt(n())) %>%
ggplot(aes(x=region,y=meanRT,colour=gram)) +
geom_point() +
geom_path(aes(lty=freq)) +
geom_errorbar(aes(ymin=meanRT-seRT, ymax=meanRT+seRT),width=.1) +
scale_x_continuous(breaks=c(1:5),labels = c("the","old","VERB","the","boat")) +
ylab("reading time (miliseconds)")
Well, check out the Datasaurus Dozen, which all have the same x/y means and standard deviations: